library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## <U+221A> tibble 3.0.4 <U+221A> dplyr 1.0.2
## <U+221A> tidyr 1.1.2 <U+221A> stringr 1.4.0
## <U+221A> readr 1.4.0 <U+221A> forcats 0.5.0
## <U+221A> purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(fastDummies)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
## The following object is masked from 'package:tidyr':
##
## smiths
library(metan)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## |========================================================|
## | Multi-Environment Trial Analysis (metan) v1.11.0 |
## | Author: Tiago Olivoto |
## | Type 'citation('metan')' to know how to cite metan |
## | Type 'vignette('metan_start')' for a short tutorial |
## | Visit 'https://bit.ly/2TIq6JE' for a complete tutorial |
## |========================================================|
##
## Attaching package: 'metan'
## The following object is masked from 'package:data.table':
##
## :=
## The following object is masked from 'package:forcats':
##
## as_factor
## The following object is masked from 'package:tidyr':
##
## replace_na
require(caTools)
## Loading required package: caTools
library(rpart)
library(gensvm)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(outliers)
The aim of this study is to examine the relationships and distributions of variables in the Diamonds dataset available in R, and ultimately to develop a meaningful price-estimation model. The libraries used in this study are listed in the Libraries section.
# Work on a copy of ggplot2's built-in diamonds dataset
data <- diamonds
There are 53,940 rows and 10 columns in the diamonds dataset.
# Dataset dimensions: 53,940 rows x 10 columns
dim(data)
## [1] 53940 10
# First six observations
head(data)
## # A tibble: 6 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# Column types: carat/depth/table/x/y/z are doubles, price is an integer,
# and cut/color/clarity are ordered factors
data %>%glimpse()
## Rows: 53,940
## Columns: 10
## $ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23,...
## $ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, ...
## $ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J,...
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS...
## $ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4,...
## $ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62,...
## $ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340,...
## $ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00,...
## $ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05,...
## $ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39,...
We can see a summary of all the data, including the minimum, maximum, median, mean and quartiles of each variable.
# Five-number summary plus mean for every column. Note that x, y and z all
# have minimums of 0 (suspicious zero dimensions), and y (max 58.9) and
# z (max 31.8) have extreme maxima far from their third quartiles.
summary(data)
## carat cut color clarity depth
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065 Min. :43.00
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258 1st Qu.:61.00
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194 Median :61.80
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171 Mean :61.75
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066 3rd Qu.:62.50
## Max. :5.0100 I: 5422 VVS1 : 3655 Max. :79.00
## J: 2808 (Other): 2531
## table price x y
## Min. :43.00 Min. : 326 Min. : 0.000 Min. : 0.000
## 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710 1st Qu.: 4.720
## Median :57.00 Median : 2401 Median : 5.700 Median : 5.710
## Mean :57.46 Mean : 3933 Mean : 5.731 Mean : 5.735
## 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540 3rd Qu.: 6.540
## Max. :95.00 Max. :18823 Max. :10.740 Max. :58.900
##
## z
## Min. : 0.000
## 1st Qu.: 2.910
## Median : 3.530
## Mean : 3.539
## 3rd Qu.: 4.040
## Max. :31.800
##
There are no missing values.
# Total count of NA values across the whole dataset - zero, so no imputation is needed
sum(is.na(data))
## [1] 0
We can see the table variable's distribution; the values are mostly concentrated around 50-60.
# Histogram of the table variable (30 bins); values concentrate around 50-60
ggplot(data, aes(x = table)) +
  geom_histogram(bins = 30L, fill = "#0c4c8a") +
  theme_minimal()
We can examine how the target variable compares to a normal distribution on both the original and logarithmic scales.
# Side-by-side Q-Q plots: price on the original scale vs. the log scale
par(mfrow = c(1, 2))
qqnorm(diamonds$price, main = "Normal Q-Q Plot of Price")
qqline(diamonds$price)
qqnorm(log(diamonds$price), main = "Normal Q-Q Plot of log Price")
qqline(log(diamonds$price))
When we examine the x, y and z variables grouped by cut, we see that Fair diamonds have the largest average dimensions, while Ideal diamonds have the lowest mean price.
data %>%
select(cut,price,x,y,z)%>%
group_by(cut)%>%
summarise(mean_price = mean(price),mean_x = mean(x),mean_y = mean(y),mean_z = mean(z))%>%
arrange(mean_price,desc())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 5
## cut mean_price mean_x mean_y mean_z
## <ord> <dbl> <dbl> <dbl> <dbl>
## 1 Ideal 3458. 5.51 5.52 3.40
## 2 Good 3929. 5.84 5.85 3.64
## 3 Very Good 3982. 5.74 5.77 3.56
## 4 Fair 4359. 6.25 6.18 3.98
## 5 Premium 4584. 5.97 5.94 3.65
When we examine the variables with the help of boxplots, we observe that there are outlier values.
# Boxplots of the x and y dimensions by cut level - both show clear outliers
ggplot(data, aes(x = x, color = cut)) +
  geom_boxplot()
ggplot(data, aes(x = y, color = cut)) +
  geom_boxplot()
# Price distribution within each cut level
ggplot(data, aes(x = cut, y = price, color = cut)) +
  geom_boxplot()
When the relationship between carat and price is examined, we can see that price increases as carat increases.
# Interactive scatter of carat vs. price on a random sample of 1,000 rows
# (sampling keeps the plotly widget light)
d <- diamonds[sample(nrow(diamonds), 1000), ]
fig <- plot_ly(
  data = d,
  x = ~carat,
  y = ~price,
  # Hover text:
  text = ~paste("Price: ", price, "$<br>Cut:", cut),
  color = ~carat,
  size = ~carat
)
fig
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `line.width` does not currently support multiple values.
# Count, mean, median and quartiles of price within each cut level
data %>%
  group_by(cut) %>%
  summarise(
    n      = n(),
    mean   = mean(price),
    median = median(price),
    Q1     = quantile(price, 0.25),
    Q3     = quantile(price, 0.75)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 6
## cut n mean median Q1 Q3
## <ord> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Fair 1610 4359. 3282 2050. 5206.
## 2 Good 4906 3929. 3050. 1145 5028
## 3 Very Good 12082 3982. 2648 912 5373.
## 4 Premium 13791 4584. 3185 1046 6296
## 5 Ideal 21551 3458. 1810 878 4678.
# Replace the single most extreme value in each numeric column with that
# column's median (outliers::rm.outlier with fill = TRUE, median = TRUE).
# NOTE(review): rm.outlier treats only ONE observation per column as an
# outlier, so this does not remove all the outliers seen in the boxplots.
# Select numeric columns by type instead of the original hard-coded
# positions -c(2, 3, 4), which silently break if columns are reordered.
num_cols <- names(data)[vapply(data, is.numeric, logical(1))]
df <- rm.outlier(data[, num_cols], fill = TRUE, median = TRUE, opposite = FALSE)
data[, num_cols] <- df
head(data)
## # A tibble: 6 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
We create dummy variables for the categorical columns so that they can be included in the model.
# One-hot encode the categorical columns, then split 80/20 into train/test
# (seed fixed for reproducibility)
data <- dummy_cols(data,
                   select_columns = c("cut", "color", "clarity"),
                   remove_selected_columns = TRUE)
set.seed(123)
split <- sample.split(data$price, SplitRatio = 0.8)
training_set <- subset(data, split)
test_set <- subset(data, !split)
# Correlation matrix of the training columns, shown as a basic tile heatmap
cormat <- training_set %>% cor() %>% round(digits = 2)
melted_cormat <- melt(cormat)
melted_cormat %>%
  ggplot(aes(x = Var1, y = Var2, fill = value)) +
  geom_tile()
# Blank out the upper triangle of a correlation matrix (keep lower + diagonal)
get_lower_tri <- function(cormat) {
  cormat[upper.tri(cormat)] <- NA
  cormat
}
# Blank out the lower triangle of a correlation matrix (keep upper + diagonal)
get_upper_tri <- function(cormat) {
  cormat[lower.tri(cormat)] <- NA
  cormat
}
# Reorder the correlation matrix by hierarchical clustering so that highly
# correlated variables sit next to each other in the heatmap.
# NOTE(review): reorder_cormat() was called in the original script but never
# defined anywhere in the file; this is the standard definition from the
# STHDA ggplot2 heatmap tutorial the surrounding code follows.
reorder_cormat <- function(cormat) {
  dd <- as.dist((1 - cormat) / 2)  # map correlation in [-1, 1] to a distance
  hc <- hclust(dd)
  cormat[hc$order, hc$order]
}
cormat <- reorder_cormat(cormat)
# Keep the upper triangle only (the original also computed upper_tri before
# reordering, but that value was immediately overwritten, so it is dropped)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix to long format, dropping the NA lower triangle
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
# Upper-triangle correlation heatmap with a diverging blue-white-red scale
ggheatmap <- melted_cormat %>%
  ggplot(aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0, limit = c(-1, 1), space = "Lab",
    name = "Pearson\nCorrelation"
  ) +
  theme_minimal() + # minimal theme
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1,
                                   size = 12)) +
  coord_fixed()
print(ggheatmap)
# Overlay the numeric correlation values and strip the axes/grid; move the
# legend inside the empty lower-right region of the triangle
ggheatmap +
  geom_text(aes(Var2, Var1, label = value), color = "black", size = 2) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.justification = c(1, 0),
    legend.position = c(0.6, 0.7),
    legend.direction = "horizontal"
  ) +
  guides(fill = guide_colorbar(
    barwidth = 7, barheight = 1,
    title.position = "top", title.hjust = 0.5
  ))
We scale the training and test data that we finally obtained. After fitting a regression on the scaled data, we keep only the significant variables and run the model again.
# Standardize all columns. The test set is scaled with the center/scale
# estimated on the TRAINING set; the original scaled each set independently,
# which leaks test-set statistics into evaluation and applies a different
# transform to train and test. (The printed output below was generated with
# the original independent scaling, so refitted numbers will differ slightly.)
training_set <- scale(training_set)
test_set <- scale(test_set,
                  center = attr(training_set, "scaled:center"),
                  scale = attr(training_set, "scaled:scale"))
d1 <- data.frame(training_set)
d2 <- data.frame(test_set)
# Baseline linear model: price regressed on every remaining predictor
regressor <- lm(formula = price ~ ., data = d1)
summary(regressor)
##
## Call:
## lm(formula = price ~ ., data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8431 -0.1494 -0.0461 0.0963 6.6026
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.172e-14 1.341e-03 0.000 1.000000
## carat 1.336e+00 6.724e-03 198.770 < 2e-16 ***
## depth -2.038e-02 2.108e-03 -9.670 < 2e-16 ***
## table -1.472e-02 1.795e-03 -8.199 2.48e-16 ***
## x -2.954e-01 1.434e-02 -20.607 < 2e-16 ***
## y 3.647e-02 1.074e-02 3.396 0.000685 ***
## z -2.899e-02 1.111e-02 -2.610 0.009054 **
## cut_Fair -3.245e-02 1.564e-03 -20.756 < 2e-16 ***
## cut_Good -1.835e-02 1.608e-03 -11.412 < 2e-16 ***
## cut_Very.Good -1.129e-02 1.637e-03 -6.897 5.38e-12 ***
## cut_Premium -8.152e-03 1.762e-03 -4.627 3.72e-06 ***
## cut_Ideal NA NA NA NA
## color_D 1.957e-01 2.382e-03 82.148 < 2e-16 ***
## color_E 2.073e-01 2.640e-03 78.529 < 2e-16 ***
## color_F 1.998e-01 2.608e-03 76.615 < 2e-16 ***
## color_G 1.917e-01 2.725e-03 70.366 < 2e-16 ***
## color_H 1.249e-01 2.480e-03 50.355 < 2e-16 ***
## color_I 6.618e-02 2.181e-03 30.336 < 2e-16 ***
## color_J NA NA NA NA
## clarity_I1 -1.516e-01 1.627e-03 -93.145 < 2e-16 ***
## clarity_SI2 -2.503e-01 3.180e-03 -78.730 < 2e-16 ***
## clarity_SI1 -1.821e-01 3.477e-03 -52.367 < 2e-16 ***
## clarity_VS2 -1.155e-01 3.381e-03 -34.169 < 2e-16 ***
## clarity_VS1 -7.057e-02 2.960e-03 -23.841 < 2e-16 ***
## clarity_VVS2 -2.960e-02 2.524e-03 -11.728 < 2e-16 ***
## clarity_VVS1 -2.158e-02 2.266e-03 -9.521 < 2e-16 ***
## clarity_IF NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2821 on 44210 degrees of freedom
## Multiple R-squared: 0.9205, Adjusted R-squared: 0.9204
## F-statistic: 2.224e+04 on 23 and 44210 DF, p-value: < 2.2e-16
# Drop the predictors flagged above: z (weakest remaining dimension) and the
# three dummy columns that were perfectly collinear (NA coefficients -
# one redundant level per categorical variable). Dropping by NAME is robust
# to column reordering, unlike the original positional c(7, 12, 19, 27)
# (which mapped to exactly these four columns).
drop_cols <- c("z", "cut_Ideal", "color_J", "clarity_IF")
d1 <- d1[, setdiff(names(d1), drop_cols)]
d2 <- d2[, setdiff(names(d2), drop_cols)]
# Refit on the reduced predictor set
regressor <- lm(formula = price ~ ., data = d1)
summary(regressor)
##
## Call:
## lm(formula = price ~ ., data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8513 -0.1494 -0.0463 0.0962 6.5954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.167e-14 1.341e-03 0.000 1.00000
## carat 1.337e+00 6.724e-03 198.766 < 2e-16 ***
## depth -2.390e-02 1.622e-03 -14.736 < 2e-16 ***
## table -1.474e-02 1.795e-03 -8.213 < 2e-16 ***
## x -3.154e-01 1.214e-02 -25.982 < 2e-16 ***
## y 2.759e-02 1.019e-02 2.708 0.00677 **
## cut_Fair -3.246e-02 1.564e-03 -20.758 < 2e-16 ***
## cut_Good -1.828e-02 1.607e-03 -11.370 < 2e-16 ***
## cut_Very.Good -1.134e-02 1.637e-03 -6.926 4.39e-12 ***
## cut_Premium -8.039e-03 1.761e-03 -4.564 5.03e-06 ***
## color_D 1.957e-01 2.382e-03 82.160 < 2e-16 ***
## color_E 2.074e-01 2.640e-03 78.543 < 2e-16 ***
## color_F 1.998e-01 2.608e-03 76.634 < 2e-16 ***
## color_G 1.918e-01 2.725e-03 70.392 < 2e-16 ***
## color_H 1.250e-01 2.480e-03 50.389 < 2e-16 ***
## color_I 6.622e-02 2.182e-03 30.355 < 2e-16 ***
## clarity_I1 -1.515e-01 1.627e-03 -93.106 < 2e-16 ***
## clarity_SI2 -2.503e-01 3.180e-03 -78.706 < 2e-16 ***
## clarity_SI1 -1.820e-01 3.477e-03 -52.344 < 2e-16 ***
## clarity_VS2 -1.155e-01 3.381e-03 -34.149 < 2e-16 ***
## clarity_VS1 -7.053e-02 2.960e-03 -23.825 < 2e-16 ***
## clarity_VVS2 -2.957e-02 2.524e-03 -11.716 < 2e-16 ***
## clarity_VVS1 -2.155e-02 2.266e-03 -9.509 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2821 on 44211 degrees of freedom
## Multiple R-squared: 0.9204, Adjusted R-squared: 0.9204
## F-statistic: 2.325e+04 on 22 and 44211 DF, p-value: < 2.2e-16
# Score the held-out set and report RMSE; since price was standardized,
# the error is expressed in (training) standard-deviation units
pred <- predict(regressor, newdata = d2)
RMSE(pred = pred, obs = d2$price)
## [1] 0.2932916